def tokenize_large_file(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        # Read one line at a time until EOF (readline returns '' there)
        lines = iter(f.readline, '')
        # Split each line into words
        words = (word for line in lines for word in line.split())
        # Strip leading and trailing punctuation from each word
        clean_words = (word.strip(',.!?:;()[]{}"\'-') for word in words)
        # Keep only non-empty words (stripping can leave empty strings)
        yield from (word for word in clean_words if word)

# Usage: words are streamed one at a time; the file is never loaded whole
for word in tokenize_large_file('war_and_peace.txt'):
    print(word)
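
# Because tokenize_large_file is a generator, it composes with other lazy
# tools without materializing the file. A minimal sketch, reusing the same
# placeholder filename as above:
from collections import Counter
from itertools import islice

# Peek at the first ten tokens; only as many lines as needed are read.
for word in islice(tokenize_large_file('war_and_peace.txt'), 10):
    print(word)

# Count word frequencies in one streaming pass; memory use is bounded by
# the vocabulary size, not the file size.
freq = Counter(w.lower() for w in tokenize_large_file('war_and_peace.txt'))
print(freq.most_common(5))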